Make draws form arb dist



In [62]:

    
import itertools

import matplotlib.pyplot as plt
import matplotlib as mpl
from pymc3 import Model, Normal, Slice
from pymc3 import sample
from pymc3 import traceplot
from pymc3.distributions import Interpolated
from theano import as_op
import theano.tensor as tt
import numpy as np
from scipy import stats
import tqdm
import pandas as pd

%matplotlib inline

%load_ext version_information

%version_information pymc3, scipy









    



The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information






    Out[62]:




Software Version
Python 3.6.2 64bit [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
IPython 6.1.0
OS Darwin 15.6.0 x86_64 i386 64bit
pymc3 3.1
scipy 0.19.1
Mon Sep 18 16:34:30 2017 MDT



In [52]:

    
np.random.seed(8675309)
x = stats.norm.rvs(loc=0, scale=2, size=100)
x = np.append(x, stats.norm.rvs(loc=4, scale=1, size=100))
plt.hist(x, 15, normed=False);



In [53]:

    
# fit a gaussian kde
kde = stats.gaussian_kde(x)

xx = np.linspace(-10,10, 100)
plt.hist(x, 15, normed=True);
plt.plot(xx, kde.evaluate(xx), c='r')









    Out[53]:





[<matplotlib.lines.Line2D at 0x120793a20>]

Brute force draw



In [54]:

    
def brute_force_draw(kde, xlim=(-10, 10), ylim=(0.0,.2), N=1000):
    samp = []
    while len(samp) < N:
        x = np.random.uniform(xlim[0], xlim[1], size=N*2)
        y = np.random.uniform(ylim[0], ylim[1], size=N*2)
        ind = kde.evaluate(x) >= y
        samp.extend(x[ind])
    samp = samp[:N]
    return  np.asarray(samp)



a = brute_force_draw(kde, N=30)
a, a.shape









    Out[54]:





(array([ 1.44720382, -1.71361794,  3.86976315,  3.25491413,  5.01536899,
        -3.92613852,  3.07717371, -2.58155544,  3.96060607,  2.58600679,
         4.76246047,  4.77344481, -1.73908732,  4.4657481 ,  5.46449549,
         1.34482137, -1.78096746, -0.95476455,  3.82152142,  3.42466047,
        -2.92877988, -0.49875371,  1.46424991,  0.01999758, -0.2549366 ,
         1.05183601,  1.93340442,  4.04922727,  4.57114483,  5.08852731]),
 (30,))



In [55]:

    
%timeit(brute_force_draw(kde, N=10))
%timeit(brute_force_draw(kde, N=100))
%timeit(brute_force_draw(kde, N=1000))
%timeit(brute_force_draw(kde, N=10000))









    



1.7 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
13.1 ms ± 377 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
25.2 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
166 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)



In [56]:

    
plt.hist(x, 15, normed=True);
plt.plot(xx, kde.evaluate(xx), c='r')
plt.hist(brute_force_draw(kde, N=10000), 15, color='g', normed=True, alpha=0.3)









    Out[56]:





(array([ 0.00037604,  0.00329032,  0.00874285,  0.02340826,  0.04935477,
         0.07793353,  0.0920349 ,  0.10519617,  0.1230579 ,  0.14326985,
         0.16009748,  0.10538419,  0.03760364,  0.00968294,  0.00065806]),
 array([-7.09472796, -6.03100101, -4.96727407, -3.90354712, -2.83982018,
        -1.77609323, -0.71236629,  0.35136066,  1.4150876 ,  2.47881455,
         3.54254149,  4.60626844,  5.66999538,  6.73372233,  7.79744928,
         8.86117622]),
 <a list of 15 Patch objects>)

Do this in pymc3



In [57]:

    
def from_hist(param, kde, xlim=(-10,10), N=100):
    width = xlim[1] - xlim[0]
    x = np.linspace(xlim[0], xlim[1], N)
    y = kde.evaluate(x)
    
    # what was never sampled should have a small probability but not 0,
    # so we'll extend the domain and use linear approximation of density on it
    x = np.concatenate([[x[0] - 3 * width], x, [x[-1] + 3 * width]])
    y = np.concatenate([[0], y, [0]])
    return Interpolated(param, x, y)



In [58]:

    
model = Model()
with model:
    # Priors are posteriors from previous iteration
    dat = from_hist('dat', kde)

    # draw 10000 posterior samples
    trace = sample(10000)









    



Auto-assigning NUTS sampler...
Initializing NUTS using ADVI...
  0%|          | 0/200000 [00:00<?, ?it/s]
Convergence archived at 100
Interrupted at 100 [0%]: Average Loss = 9.7185
100%|██████████| 10500/10500 [00:08<00:00, 1202.53it/s]



In [59]:

    
traceplot(trace)









    Out[59]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1244d0c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x124587550>]], dtype=object)



In [60]:

    
plt.plot(xx, kde.evaluate(xx), c='r')
plt.hist(trace['dat'], 25, normed=True, color='g');



In [ ]:

For the heck of it benchmark the brute_force_draw()



In [95]:

    
df = pd.DataFrame({'iter':[], 'time':[]})



In [101]:

    
import time
for _ in range(10):
    for i in np.logspace(2, 6, 15).astype(int):
        t0 = time.time()
        brute_force_draw(kde, N=i)
        t = time.time() - t0
        df = df.append({'iter':i, 'time':t}, ignore_index=True)



In [102]:

    
df.head()



In [103]:

    
df.groupby('iter').mean()



In [104]:

    
plt.errorbar(np.logspace(2, 6, 15).astype(int), df.groupby('iter').mean().as_matrix(), 
             yerr=df.groupby('iter').std().as_matrix())
plt.xscale('log')
plt.yscale('log')
plt.xlabel('N')
plt.ylabel('Time [s]')









    



---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-104-6e43cdb68000> in <module>()
      1 plt.errorbar(np.logspace(2, 6, 15).astype(int), df.groupby('iter').mean().as_matrix(), 
----> 2              yerr=df.groupby('iter').std().as_matrix())
      3 plt.xscale('log')
      4 plt.yscale('log')
      5 plt.xlabel('N')

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/pyplot.py in errorbar(x, y, yerr, xerr, fmt, ecolor, elinewidth, capsize, barsabove, lolims, uplims, xlolims, xuplims, errorevery, capthick, hold, data, **kwargs)
   2927                           xlolims=xlolims, xuplims=xuplims,
   2928                           errorevery=errorevery, capthick=capthick, data=data,
-> 2929                           **kwargs)
   2930     finally:
   2931         ax._hold = washold

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1896                     warnings.warn(msg % (label_namer, func.__name__),
   1897                                   RuntimeWarning, stacklevel=2)
-> 1898             return func(ax, *args, **kwargs)
   1899         pre_doc = inner.__doc__
   1900         if pre_doc is None:

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in errorbar(self, x, y, yerr, xerr, fmt, ecolor, elinewidth, capsize, barsabove, lolims, uplims, xlolims, xuplims, errorevery, capthick, **kwargs)
   2905         if plot_line:
   2906             data_line = mlines.Line2D(x, y, **plot_line_style)
-> 2907             self.add_line(data_line)
   2908 
   2909         barcols = []

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/axes/_base.py in add_line(self, line)
   1791             line.set_clip_path(self.patch)
   1792 
-> 1793         self._update_line_limits(line)
   1794         if not line.get_label():
   1795             line.set_label('_line%d' % len(self.lines))

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _update_line_limits(self, line)
   1813         Figures out the data limit of the given line, updating self.dataLim.
   1814         """
-> 1815         path = line.get_path()
   1816         if path.vertices.size == 0:
   1817             return

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/lines.py in get_path(self)
    987         """
    988         if self._invalidy or self._invalidx:
--> 989             self.recache()
    990         return self._path
    991 

~/miniconda3/envs/python3/lib/python3.6/site-packages/matplotlib/lines.py in recache(self, always)
    694 
    695         if len(x) != len(y):
--> 696             raise RuntimeError('xdata and ydata must be the same length')
    697 
    698         self._xy = np.empty((len(x), 2), dtype=np.float_)

RuntimeError: xdata and ydata must be the same length



In [ ]:

    
df.groupby('iter').std().as_matrix()



In [ ]:

	iter	time
0	100.0	0.028157
1	163.0	0.013186
2	268.0	0.013011
3	439.0	0.022207
4	719.0	0.017751

	time
iter
100.0	0.015709
163.0	0.016071
193.0	0.013737
268.0	0.018810
372.0	0.018922
439.0	0.019284
719.0	0.023510
1178.0	0.029916
1389.0	0.031592
1930.0	0.046104
2682.0	0.056344
3162.0	0.061267
5179.0	0.092376
8483.0	0.157581
10000.0	0.159811
13894.0	0.229134
19306.0	0.300504
22758.0	0.360196
37275.0	0.565721
61054.0	0.967196
71968.0	1.017366
100000.0	1.461324
138949.0	2.110246
268269.0	4.984222
517947.0	14.754781
1000000.0	24.656502

Software	Version
Python	3.6.2 64bit [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
IPython	6.1.0
OS	Darwin 15.6.0 x86_64 i386 64bit
pymc3	3.1
scipy	0.19.1
Mon Sep 18 16:34:30 2017 MDT